Commit bb917bd

Add vision model support to new server
The llamafiler v1 chat completions endpoint now lets you embed data URIs containing base64-encoded images inside messages, so --mmproj vision models (e.g. LLaVA) will be able to analyze them. My new web GUI supports image uploads too: images can be dragged and dropped, just like GitHub's online markdown editor lets you do, and they can also be pasted from the clipboard.
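
A request might look like the sketch below. The field names assume the OpenAI-compatible chat completions schema, the model name is a placeholder, and the base64 payload is truncated for illustration; per the eval_string change in this commit, the data URI is embedded directly in the message text:

    POST /v1/chat/completions
    Content-Type: application/json

    {
      "model": "llava-v1.5-7b",
      "messages": [
        {
          "role": "user",
          "content": "What is in this image? data:image/png;base64,iVBORw0KGgo..."
        }
      ]
    }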
1 parent a03b47e commit bb917bd

38 files changed: +1212 -251 lines

llamafile/chatbot_eval.cpp

Lines changed: 18 additions & 14 deletions
@@ -16,17 +16,17 @@
 // limitations under the License.
 
 #include "chatbot.h"
-
-#include <cassert>
-#include <string>
-#include <vector>
-
+#include "llama.cpp/base64.h"
 #include "llama.cpp/common.h"
 #include "llama.cpp/llama.h"
 #include "llama.cpp/llava/llava.h"
 #include "llamafile/datauri.h"
 #include "llamafile/image.h"
+#include "llamafile/llama.h"
 #include "llamafile/string.h"
+#include <cassert>
+#include <string>
+#include <vector>
 
 namespace lf {
 namespace chatbot {
@@ -60,7 +60,7 @@ bool eval_tokens(std::vector<llama_token> tokens) {
     return true;
 }
 
-bool eval_image_embed(const struct llava_image_embed *image_embed) {
+bool eval_image_embed(const llava_image_embed *image_embed) {
     int N = image_embed->n_image_pos;
     if (tokens_used() + N > llama_n_ctx(g_ctx))
         return out_of_context(N);
@@ -113,7 +113,7 @@ bool eval_token(int id) {
 }
 
 bool eval_plain_text(const std::string &str, bool add_special, bool parse_special) {
-    return eval_tokens(llama_tokenize(g_model, str, add_special, parse_special));
+    return eval_tokens(llamafile_tokenize(g_model, str, add_special, parse_special));
 }
 
 bool eval_string(std::string_view s, bool add_special, bool parse_special) {
@@ -122,22 +122,26 @@ bool eval_string(std::string_view s, bool add_special, bool parse_special) {
         size_t pos = s.find("data:", i);
         if (pos == std::string_view::npos)
             return eval_plain_text(std::string(s), add_special, parse_special);
+        i = pos + 5;
         DataUri uri;
         size_t end = uri.parse(s.substr(pos + 5));
-        if (end == std::string_view::npos) {
-            i = pos + 5;
+        if (end == std::string_view::npos)
             continue;
-        }
-        std::string image = uri.decode();
-        if (!is_image(image)) {
-            i = pos + 5;
+        if (!uri.mime.starts_with("image/"))
+            continue;
+        std::string image;
+        try {
+            image = uri.decode();
+        } catch (const base64_error &e) {
             continue;
         }
+        if (!is_image(image))
+            continue;
         if (!eval_plain_text(std::string(s.substr(0, pos)), add_special, parse_special))
             return false;
         if (!eval_image(image))
             return false;
-        s = s.substr(pos + 5 + end);
+        s = s.substr(i + end);
         i = 0;
     }
 }
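
Two details worth noting in the hunk above: the resume offset i = pos + 5 is now set once at the top, so every continue path (unparseable URI, non-image MIME type, bad base64) resumes scanning just past the "data:" prefix, and the final s = s.substr(i + end) is equivalent to the old pos + 5 + end. A minimal sketch of how one match is handled, assuming only the DataUri and base64_error APIs visible in the diff:

    // Sketch (not part of the commit): handling a single "data:" match.
    std::string_view s = "look at data:image/png;base64,iVBORw0KGgo= please";
    size_t pos = s.find("data:");              // start of the candidate URI
    DataUri uri;
    size_t end = uri.parse(s.substr(pos + 5)); // bytes consumed after "data:"
    if (end != std::string_view::npos && uri.mime.starts_with("image/")) {
        try {
            std::string image = uri.decode();  // raw image bytes
            // is_image(image) / eval_image(image) would run here
        } catch (const base64_error &) {
            // malformed base64 payload: skip it and keep scanning
        }
    }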

llamafile/chatbot_main.cpp

Lines changed: 2 additions & 1 deletion
@@ -29,6 +29,7 @@
 #include "llama.cpp/server/server.h"
 #include "llamafile/color.h"
 #include "llamafile/compute.h"
+#include "llamafile/llama.h"
 #include "llamafile/string.h"
 
 namespace lf {
@@ -69,7 +70,7 @@ std::string describe_compute(void) {
 std::string token_to_piece(const struct llama_context *ctx, llama_token token, bool special) {
     if (token == IMAGE_PLACEHOLDER_TOKEN)
         return "";
-    return llama_token_to_piece(ctx, token, special);
+    return llamafile_token_to_piece(ctx, token, special);
 }
 
 void on_server_listening(const char *host, int port) {

llamafile/flags.cpp

Lines changed: 1 addition & 1 deletion
@@ -64,7 +64,7 @@ int FLAG_batch = 2048;
 int FLAG_ctx_size = 8192;
 int FLAG_flash_attn = false;
 int FLAG_gpu = 0;
-int FLAG_http_ibuf_size = 1024 * 1024;
+int FLAG_http_ibuf_size = 5 * 1024 * 1024;
 int FLAG_http_obuf_size = 1024 * 1024;
 int FLAG_keepalive = 5;
 int FLAG_main_gpu = 0;
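
The fivefold bump to the HTTP input buffer is presumably what makes inline images practical: base64 inflates payloads by a factor of 4/3, so a 5 MiB buffer accommodates roughly a 3.7 MiB image (5 / (4/3) ≈ 3.75 MiB) plus the surrounding JSON, whereas the old 1 MiB limit would have rejected all but small thumbnails.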

llamafile/llama.cpp

Lines changed: 38 additions & 0 deletions
@@ -16,10 +16,48 @@
 // limitations under the License.
 
 #include "llama.h"
+#include "llama.cpp/llama.h"
+#include <cassert>
+#include <string>
+#include <vector>
 
 int llamafile_token_eot(llama_model *model) {
     llama_token eot = llama_token_eot(model);
     if (eot != -1)
         return eot;
     return llama_token_eos(model);
 }
+
+std::string llamafile_token_to_piece(const llama_context *ctx, llama_token token, bool special) {
+    std::string piece;
+    piece.resize(piece.capacity());
+    const int n_chars =
+        llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+    if (n_chars < 0) {
+        piece.resize(-n_chars);
+        int check =
+            llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+        unassert(check == -n_chars);
+    } else {
+        piece.resize(n_chars);
+    }
+    return piece;
+}
+
+std::vector<llama_token> llamafile_tokenize(const struct llama_model *model,
+                                            const std::string_view &text, bool add_special,
+                                            bool parse_special) {
+    int n_tokens = text.size() + 2 * add_special;
+    std::vector<llama_token> result(n_tokens);
+    n_tokens = llama_tokenize(model, text.data(), text.size(), result.data(), result.size(),
+                              add_special, parse_special);
+    if (n_tokens < 0) {
+        result.resize(-n_tokens);
+        int check = llama_tokenize(model, text.data(), text.size(), result.data(), result.size(),
+                                   add_special, parse_special);
+        unassert(check == -n_tokens);
+    } else {
+        result.resize(n_tokens);
+    }
+    return result;
+}
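
Both helpers follow llama.cpp's size-negotiation convention: call once with a guessed buffer, and if the return value is negative, its magnitude is the required size, so resize and call again. A hedged usage sketch, assuming a model and context already loaded elsewhere and with error handling elided:

    // Hypothetical round-trip through the two helpers above.
    std::vector<llama_token> toks =
        llamafile_tokenize(model, "an example prompt",
                           /*add_special=*/true, /*parse_special=*/false);
    std::string text;
    for (llama_token tok : toks)
        text += llamafile_token_to_piece(ctx, tok, /*special=*/false);
    // `text` now approximates the prompt; special tokens render as "".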

llamafile/llama.h

Lines changed: 8 additions & 1 deletion
@@ -16,7 +16,8 @@
 // limitations under the License.
 
 #pragma once
-#include "llama.cpp/llama.h"
+#include <__fwd/string_view.h>
+#include <__fwd/vector.h>
 
 // Many llama.cpp APIs take boolean parameters at the end. Please favor
 // passing these constants as arguments instead, for better readability
@@ -36,4 +37,10 @@
 #define RENDER_SPECIAL_TOKENS true
 #define DONT_RENDER_SPECIAL_TOKENS false
 
+struct llama_model;
+struct llama_context;
+
 int llamafile_token_eot(llama_model *);
+
+std::string llamafile_token_to_piece(const llama_context *, int, bool);
+std::vector<int> llamafile_tokenize(const llama_model *, const std::string_view &, bool, bool);
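
The header no longer drags in all of llama.cpp: <__fwd/string_view.h> and <__fwd/vector.h> are libc++'s forward-declaration headers for std::string_view and std::vector, and llama_model / llama_context become opaque forward declarations, so translation units that only want the constants avoid the full llama.h parse. The prototypes can spell the token type as plain int because llama_token is a typedef for int32_t, so they still match the definitions in llamafile/llama.cpp.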

llamafile/server/BUILD.mk

Lines changed: 39 additions & 26 deletions
@@ -12,17 +12,17 @@ LLAMAFILE_SERVER_ASSETS = $(wildcard llamafile/server/www/*)
 
 $(LLAMAFILE_SERVER_OBJS): private CCFLAGS += -g
 
-o/$(MODE)/llamafile/server/server.a: \
+o/$(MODE)/llamafile/server/server.a:                            \
         $(filter-out %_test.o,$(LLAMAFILE_SERVER_OBJS))
 
-o/$(MODE)/llamafile/server/main: \
-        o/$(MODE)/llamafile/server/main.o \
-        o/$(MODE)/llamafile/server/main.1.asc.zip.o \
-        o/$(MODE)/llamafile/server/server.a \
-        o/$(MODE)/llama.cpp/llama.cpp.a \
-        o/$(MODE)/llama.cpp/llava/llava.a \
-        o/$(MODE)/double-conversion/double-conversion.a \
-        o/$(MODE)/stb/stb.a \
+o/$(MODE)/llamafile/server/main:                                \
+        o/$(MODE)/llamafile/server/main.o                       \
+        o/$(MODE)/llamafile/server/main.1.asc.zip.o             \
+        o/$(MODE)/llamafile/server/server.a                     \
+        o/$(MODE)/llama.cpp/llama.cpp.a                         \
+        o/$(MODE)/llama.cpp/llava/llava.a                       \
+        o/$(MODE)/double-conversion/double-conversion.a         \
+        o/$(MODE)/stb/stb.a                                     \
         $(LLAMAFILE_SERVER_ASSETS:%=o/$(MODE)/%.zip.o)
 
 # turn /zip/llamafile/server/www/...
@@ -31,24 +31,37 @@ $(LLAMAFILE_SERVER_ASSETS:%=o/$(MODE)/%.zip.o): private ZIPOBJ_FLAGS += -C2
 
 $(LLAMAFILE_SERVER_OBJS): llamafile/server/BUILD.mk
 
-o/$(MODE)/llamafile/server/fastjson_test: \
-        o/$(MODE)/llamafile/server/fastjson_test.o \
-        o/$(MODE)/llamafile/server/fastjson.o \
-        o/$(MODE)/double-conversion/double-conversion.a \
+o/$(MODE)/llamafile/server/atom_test:                           \
+        o/$(MODE)/llamafile/server/atom_test.o                  \
+        o/$(MODE)/llamafile/server/atom.o                       \
+        o/$(MODE)/llamafile/server/image.o                      \
 
-o/$(MODE)/llamafile/server/json_test: \
-        o/$(MODE)/llamafile/server/json_test.o \
-        o/$(MODE)/llamafile/server/json.o \
-        o/$(MODE)/llamafile/server/hextoint.o \
-        o/$(MODE)/double-conversion/double-conversion.a \
+o/$(MODE)/llamafile/server/image_test:                          \
+        o/$(MODE)/llamafile/server/image_test.o                 \
+        o/$(MODE)/llamafile/server/image.o                      \
 
-o/$(MODE)/llamafile/server/tokenbucket_test: \
-        o/$(MODE)/llamafile/server/tokenbucket_test.o \
-        o/$(MODE)/llamafile/server/tokenbucket.o \
-        o/$(MODE)/llamafile/server/log.o \
-        o/$(MODE)/llama.cpp/llama.cpp.a \
+o/$(MODE)/llamafile/server/fastjson_test:                       \
+        o/$(MODE)/llamafile/server/fastjson_test.o              \
+        o/$(MODE)/llamafile/server/fastjson.o                   \
+        o/$(MODE)/double-conversion/double-conversion.a         \
+
+o/$(MODE)/llamafile/server/json_test:                           \
+        o/$(MODE)/llamafile/server/json_test.o                  \
+        o/$(MODE)/llamafile/server/json.o                       \
+        o/$(MODE)/llamafile/server/hextoint.o                   \
+        o/$(MODE)/double-conversion/double-conversion.a         \
+
+o/$(MODE)/llamafile/server/tokenbucket_test:                    \
+        o/$(MODE)/llamafile/server/tokenbucket_test.o           \
+        o/$(MODE)/llamafile/server/tokenbucket.o                \
+        o/$(MODE)/llamafile/server/log.o                        \
+        o/$(MODE)/llama.cpp/llama.cpp.a                         \
 
 .PHONY: o/$(MODE)/llamafile/server
-o/$(MODE)/llamafile/server: \
-        o/$(MODE)/llamafile/server/main \
-        o/$(MODE)/llamafile/server/json_test.runs \
+o/$(MODE)/llamafile/server:                                     \
+        o/$(MODE)/llamafile/server/main                         \
+        o/$(MODE)/llamafile/server/atom_test.runs               \
+        o/$(MODE)/llamafile/server/fastjson_test.runs           \
+        o/$(MODE)/llamafile/server/image_test.runs              \
+        o/$(MODE)/llamafile/server/json_test.runs               \
+        o/$(MODE)/llamafile/server/tokenbucket_test.runs        \
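
Assuming the repo's usual make conventions, building the phony aggregate (e.g. make -j8 o//llamafile/server) should now run the new atom_test and image_test alongside the existing fastjson, json, and tokenbucket suites.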

llamafile/server/atob.cpp

Lines changed: 0 additions & 1 deletion
@@ -16,7 +16,6 @@
 // limitations under the License.
 
 #include "utils.h"
-
 #include <string_view>
 
 bool
